# Display a pre-rendered word-cloud image inline (works in IPython/Jupyter only;
# NOTE: this `Image` is later shadowed by `from PIL import Image` below).
from IPython.display import Image
Image("clouds.png")
The most frequently searched keywords were examined using text-mining methods (konlpy for Korean keywords and nltk/wordcloud for English keywords).
Using wordcloud, the most frequent keywords were then visualized separately for Korean and for English.
# Import required libraries
# NOTE: "%matplotlib inline" is an IPython magic — this file is a flattened
# notebook and is not runnable as a plain Python script.
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the search-keyword dataset; a 'Keyword' column is required
# (it is indexed as df['Keyword'] throughout the script).
df = pd.read_csv('keywords.csv')
# Notebook-style peeks at the first rows and the table dimensions.
df.head()
df.shape
import re
def apply_regular_expression(text):
    """Return *text* with every character removed except spaces,
    Hangul jamo (ㄱ-ㅣ), and Hangul syllables (가-힣)."""
    hangul_only = re.compile('[^ ㄱ-ㅣ 가-힣]')
    return hangul_only.sub('', text)
# Sanity check: run the Hangul-only filter on the first keyword.
apply_regular_expression(df['Keyword'][0])
from konlpy.tag import Okt
from collections import Counter
# Okt: Open Korean Text morphological analyzer from konlpy.
okt = Okt()
# Smoke test: extract the nouns of the first (cleaned) keyword.
nouns = okt.nouns(apply_regular_expression(df['Keyword'][0]))
nouns
# Create corpus: join all keywords separated by spaces. The original used
# "".join, which fused the last word of one keyword with the first word of
# the next into bogus tokens; " " matches the English path later in the file.
corpus = " ".join(df['Keyword'].tolist())
# Apply regular expressions (notebook display — result is not stored here).
apply_regular_expression(corpus)
# Extract tokens from corpus
nouns = okt.nouns(apply_regular_expression(corpus))
# Count noun frequencies and show the top 10.
counter = Counter(nouns)
counter.most_common(10)
#Delete Stopwords
# Drop one-character nouns (mostly particles/noise) from the counts.
available_counter = Counter(
    {word: freq for word, freq in counter.items() if len(word) > 1}
)
available_counter.most_common(10)
#Delete Stopwords from an external dictionary
# header=None: the file is a plain one-word-per-line list; without it pandas
# treats the first stopword as a column header and silently drops it.
# Selecting column 0 then .tolist() yields a flat list of strings — the
# original .values.tolist() produced a list of single-element lists.
# NOTE(review): this `stopwords` is overwritten by set(STOPWORDS) below and
# never applied to the Korean counts — confirm whether that is intended.
stopwords = pd.read_csv(
    "https://raw.githubusercontent.com/yoonkt200/FastCampusDataset/master/korean_stopwords.txt",
    header=None,
)[0].tolist()
# Notebook shell magic: install the wordcloud package.
!pip install wordcloud
import nltk
# Fetch NLTK data needed for English stopwords and tokenization.
nltk.download('stopwords')
nltk.download('punkt')
from wordcloud import WordCloud, STOPWORDS
import numpy as np
import nltk
# NOTE: PIL's Image shadows the IPython Image imported at the top of the file.
from PIL import Image
# The raw CSV text is used directly as word-cloud input.
text = open('keywords.csv').read()
# Mask image for the word-cloud shape (presumably the cloud fills the
# non-white region — see the wordcloud mask docs to confirm).
circle = np.array(Image.open('circle_1.jpeg'))
stopwords = set(STOPWORDS)
stopwords.add("said")
print('The number of stopwords:',len(nltk.corpus.stopwords.words('english')))
print(nltk.corpus.stopwords.words('english')[:40])
# Preview the mask image.
plt.figure(figsize=(8,8))
plt.imshow(circle, cmap=plt.cm.gray, interpolation='bilinear')
plt.axis('off')
plt.show()
# Install Korean fonts
# apt-get magics: install the Nanum fonts so Korean glyphs can render
# (Debian/Ubuntu environments such as Colab).
!apt-get update -qq
!apt-get install fonts-nanum* -qq
text = open('keywords.csv').read()
circle = np.array(Image.open('circle_1.jpeg'))
stopwords = set(STOPWORDS)
stopwords.add("said")
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
# Korean word cloud: Nanum font, white background, circular mask.
wc = WordCloud(font_path = fontpath, background_color='white', max_words=2000, mask=circle,
stopwords = stopwords)
wc = wc.generate(text)
# words_ holds the word frequencies computed by generate() (wordcloud API).
wc.words_
plt.figure(figsize=(12,12))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()
# Finding English Keywords
# Defining Data
# Accumulator for the English-only keyword strings extracted below.
list_eng = []
#Delete Special cases
def cleanText(readData):
    """Replace every punctuation/special character in *readData* with a space."""
    special_chars = '[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]'
    return re.sub(special_chars, ' ', readData)
# Pull the English-only portion of every keyword, uppercase it, and keep
# the non-empty results in list_eng.
for raw_keyword in df['Keyword']:
    # Deleting Korean words (keep ASCII letters only)
    english_only = re.sub('[^a-zA-Z]',' ',raw_keyword).strip()
    #Delete Special cases
    english_only = cleanText(english_only)
    #Capitalize all words
    english_only = english_only.upper()
    # Deleting empty rows
    if english_only:
        list_eng.append(english_only)
print(list_eng)
# dictionary of lists — named eng_dict to avoid shadowing the builtin `dict`
eng_dict = {'name': list_eng}
df = pd.DataFrame(eng_dict)
# saving the dataframe
# index=False keeps the row numbers out of the CSV: the file is read back
# verbatim as word-cloud text at the end of the script, and the original
# call leaked every index number into that text.
df.to_csv('eng_keywords.csv', index=False)
# Join all English keywords into one space-separated string for tokenizing.
corpus = pd.DataFrame(list_eng)
text = " ".join(corpus[0].tolist())
text
from nltk.tokenize import word_tokenize
# Split the joined English keyword string into word tokens.
tokenized_word=word_tokenize(text)
print(tokenized_word)
from nltk.probability import FreqDist
# Token frequency distribution over the English keywords.
fdist = FreqDist(tokenized_word)
print(fdist)
fdist.most_common(10)
# Frequency Distribution Plot
import matplotlib.pyplot as plt
# Plot the 30 most common tokens (non-cumulative counts).
fdist.plot(30,cumulative=False)
plt.show()
#removing stopwords
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
# English stopword vocabulary from NLTK.
english_stops = set(stopwords.words('english'))
word_tokens = word_tokenize(text)
# Keep only the tokens that are not stopwords.
result = [tok for tok in word_tokens if tok not in english_stops]
text = result
# Count the surviving tokens and show the top 10.
counter = Counter(text)
counter.most_common(10)
#Delete Stopwords
# Additionally drop single-character tokens from the counts.
available_counter = Counter({tok: n for tok, n in counter.items() if len(tok) > 1})
available_counter.most_common(10)
# English word cloud built from the saved CSV.
# NOTE(review): the CSV written above includes the 'name' header row and row
# indices, which leak into the cloud text — confirm this is intended.
text_eng = open('eng_keywords.csv').read()
circle = np.array(Image.open('circle_1.jpeg'))
stopwords = set(STOPWORDS)
# collocations=False disables bigram pairing (wordcloud API), so only
# single words are counted.
wc_e = WordCloud(collocations=False, background_color='white', max_words=2000, mask=circle, stopwords=stopwords)
wc_e = wc_e.generate(text_eng)
wc_e.words_
plt.figure(figsize=(10,10))
plt.imshow(wc_e, interpolation='bilinear')
plt.axis('off')
plt.show()